library(tm)
library(Matrix)	

# Work from the shared project folder; every relative path below resolves against it.
setwd("~/Dropbox/Shared Folders/Tom Clark - LDA+IRT/Code for Paper/")


# Volume range that identifies which opinion-text file is loaded.
start.volume <- 329
stop.volume <- 546

# (A "set topic number" step was announced here, but no topic number is set in this script.)

# Load the opinion-text objects for the chosen volume range, then the
# justice-centered and case-centered Spaeth (SCDB) tables.
load(file = paste0("OpinionTextData/opinionsFormattedForTopicmodels", start.volume, stop.volume, ".RData"))
load("SpaethData/SCDB_2011_03_justiceCentered_Citation.Rdata")
load("SpaethData/SCDB_2011_03_caseCentered_Citation.Rdata")

# Create identifiers to link each opinion to the Spaeth (SCDB) data.
# Every opinion name is split on a literal "."; fields 1 and 3 are combined into
# a "<field1> U.S. <field3>" citation and field 4 is kept as the docket number
# (presumably names look like <volume>.<reporter>.<page>.<docket>... -- TODO confirm).
foo <- strsplit(working.list.of.opinions, split=".", fixed=TRUE)
# vapply() replaces growing the vectors inside a 1:length() loop: it also works
# when there are no opinions and always returns plain character vectors.
# A missing field yields NA, exactly as element-wise indexing did before.
citation <- vapply(foo, function(parts) paste(parts[1], "U.S.", parts[3], sep=" "), character(1), USE.NAMES=FALSE)
docketNo <- vapply(foo, function(parts) parts[4], character(1), USE.NAMES=FALSE)
# Row of the case-centered table whose usCite equals each citation (NA if none).
bar <- match(citation, as.character(SCDB_2011_03_caseCentered_Citation$usCite))



# create matrix with citations, justices' votes
#
# Index the justices first: tempJusticeId numbers the distinct values of
# `justice` from 1 to totalJustices (factor-level order), and
# justiceNameVector[j] is the name found on the first row carrying id j.
justiceFactor <- as.factor(SCDB_2011_03_justiceCentered_Citation$justice)
totalJustices <- nlevels(justiceFactor)
SCDB_2011_03_justiceCentered_Citation$tempJusticeId <- as.numeric(justiceFactor)
# match() gives the first row of every id in one pass, replacing the
# per-justice min(which(...)) scan.
firstRowOfJustice <- match(seq_len(totalJustices), SCDB_2011_03_justiceCentered_Citation$tempJusticeId)
# as.character() mirrors how usCite, dateDecision and caseName are handled
# elsewhere in this script: if justiceName is stored as a factor, copying its
# elements into a character vector would keep the integer codes, not the names.
justiceNameVector <- as.character(SCDB_2011_03_justiceCentered_Citation$justiceName[firstRowOfJustice])


# One row per document of Opinions_dtm, one column per justice; every cell starts as NA.
nDocs <- dim(Opinions_dtm)[1]
voteMatrix <- matrix(NA, nrow=nDocs, ncol=totalJustices)

# Case-level details, one row per document and six columns, all starting as NA.
# The columns receive their final names at the end of this section.
blankColumn <- rep(NA, nDocs)
voteDetails <- data.frame(blankColumn, blankColumn, blankColumn, blankColumn, blankColumn, blankColumn, stringsAsFactors=FALSE)

for (doc in seq_len(nDocs)) {
	# Justice-centered rows whose usCite equals this document's citation.
	caseRows <- which(SCDB_2011_03_justiceCentered_Citation$usCite == citation[doc])
	if (length(caseRows) > 0) {
		tempVotes <- SCDB_2011_03_justiceCentered_Citation[caseRows, ]

		# Case-level fields are read from the first matching row.
		voteDetails[doc, 1] <- tempVotes$issueArea[1]
		voteDetails[doc, 2] <- tempVotes$issue[1]
		voteDetails[doc, 3] <- tempVotes$naturalCourt[1]
		voteDetails[doc, 4] <- tempVotes$term[1]
		voteDetails[doc, 5] <- as.character(tempVotes$dateDecision[1])
		voteDetails[doc, 6] <- as.character(tempVotes$caseName[1])

		# Recode all votes of the case at once:
		# majority == 2 -> 1, majority == 1 -> 0, anything else -> NA.
		recodedVotes <- ifelse(tempVotes$majority == 2, 1, ifelse(tempVotes$majority == 1, 0, NA))
		justiceColumns <- tempVotes$tempJusticeId
		# Rows without a justice id cannot be placed in a column and are skipped.
		hasColumn <- !is.na(justiceColumns)
		if (any(hasColumn)) {
			# If a justice appears more than once, the last matching row wins.
			voteMatrix[doc, justiceColumns[hasColumn]] <- recodedVotes[hasColumn]
		}
	}
}
colnames(voteMatrix) <- justiceNameVector
names(voteDetails) <- c("IssueArea","Issue","NaturalCourt","Term","DecisionDate","CaseName")

# Publish the three results under their final names, labelling the rows of the
# vote matrix and the documents of the term matrix with the citations, then
# drop the working copies.
VoteMatrix <- voteMatrix
rownames(VoteMatrix) <- citation

WordSparseMatrix <- Opinions_dtm
WordSparseMatrix$dimnames$Docs <- citation

VoteDetails <- voteDetails

rm(voteMatrix, Opinions_dtm, voteDetails)

## Select Only Cases With Non-Unanimous Voting Data ##

# A case has usable votes when at least one justice's vote is recorded.
# Counting recorded votes avoids hard-coding the number of justice columns
# (previously the literal 36).
HasAnyVote <- rowSums(!is.na(VoteMatrix)) > 0
# Votes are coded 1/0 when the matrix is built, so a row mean below 1 means at
# least one 0 vote. Rows without any vote give NaN here (comparison NA); they
# are turned into FALSE by HasAnyVote when the two masks are combined.
HasSplitVote <- rowMeans(VoteMatrix,na.rm=TRUE) < 1

# Alternative selection: every non-unanimous case.
# HasNonUnanimousVoteData <- HasAnyVote & HasSplitVote
# KeepSparseMatrixEntry <- HasNonUnanimousVoteData[WordSparseMatrix$i]

## Select Only Cases With Non-Unanimous Voting Data and Rehnquist on the Court ##

# Alternative selection: from the first case with a recorded vote in column 24.
#RehnquistOnCourt <- min(which(!is.na(VoteMatrix[,24]))) <= seq_len(nrow(VoteMatrix))
#HasNonUnanimousVoteData <- HasAnyVote & HasSplitVote & RehnquistOnCourt
#KeepSparseMatrixEntry <- HasNonUnanimousVoteData[WordSparseMatrix$i]

## Select Only Cases With Non-Unanimous Voting Data, Starting When Minton Joins in 1949 and Ending Before Roberts Joins ##

# Columns 11 and 33 are taken to be Minton and Roberts (per the variable name
# below) -- NOTE(review): verify against colnames(VoteMatrix).
mintonColumn <- 11
robertsColumn <- 33
# The window is expressed in row positions, so it presumably relies on the rows
# being in chronological order -- TODO confirm.
caseIndex <- seq_len(nrow(VoteMatrix))
# min() of an empty set is Inf (with a warning): no recorded Minton vote selects
# no case, no recorded Roberts vote leaves the window open at the end.
firstMintonCase <- min(which(!is.na(VoteMatrix[,mintonColumn])))
firstRobertsCase <- min(which(!is.na(VoteMatrix[,robertsColumn])))
AfterMintonBeforeRoberts <- firstMintonCase <= caseIndex & firstRobertsCase > caseIndex
HasNonUnanimousVoteData <- HasAnyVote & HasSplitVote & AfterMintonBeforeRoberts
# One flag per stored entry of the term matrix: TRUE when its document is kept.
KeepSparseMatrixEntry <- HasNonUnanimousVoteData[WordSparseMatrix$i]


# Remove Extraneous Cases
# NonUnanimousReMap[old row] is the new row number of a kept case and NA for a
# dropped one. Direct indexed assignment replaces a loop that searched the
# kept-case list once per case.
NonUnanimousReMapCases <- which(HasNonUnanimousVoteData)
NonUnanimousReMap <- rep(NA_integer_, length(HasNonUnanimousVoteData))
NonUnanimousReMap[NonUnanimousReMapCases] <- seq_along(NonUnanimousReMapCases)

# Keep only the stored entries that belong to retained documents and renumber
# their rows. which() makes the selection safe should the mask contain NA.
keptEntries <- which(KeepSparseMatrixEntry)
WordSparseMatrix$i <- NonUnanimousReMap[WordSparseMatrix$i[keptEntries]]
WordSparseMatrix$j <- WordSparseMatrix$j[keptEntries]
WordSparseMatrix$v <- WordSparseMatrix$v[keptEntries]
WordSparseMatrix$nrow <- length(NonUnanimousReMapCases)
WordSparseMatrix$dimnames$Docs <- citation[NonUnanimousReMapCases]

# drop=FALSE keeps VoteMatrix a matrix even when a single case or a single
# justice survives; otherwise it collapses to a vector and colSums() fails.
VoteMatrix <- VoteMatrix[NonUnanimousReMapCases, , drop=FALSE]
# Keep only justices with more than 10 recorded votes among the retained cases.
VoteMatrix <- VoteMatrix[, which(colSums(!is.na(VoteMatrix)) > 10), drop=FALSE]
VoteDetails <- VoteDetails[NonUnanimousReMapCases,]

## Identify Absent and Singleton Terms and Remove Them ##

# tabulate() counts the stored entries per term column; a term is kept only when
# it has more than one (presumably one entry per document containing the term,
# so this drops terms found in at most one remaining document -- TODO confirm).
KeepTerms <- tabulate(WordSparseMatrix$j) > 1
KeepTermIDs <- which(KeepTerms)

# KeepTermReMap[old column] is the new column number of a kept term and NA for a
# dropped one. Direct indexed assignment replaces a loop that searched
# KeepTermIDs once per term.
KeepTermReMap <- rep(NA_integer_, length(KeepTerms))
KeepTermReMap[KeepTermIDs] <- seq_along(KeepTermIDs)

# Stored entries whose term survives.
KeepTermLocs <- WordSparseMatrix$j %in% KeepTermIDs

WordSparseMatrix$i <- WordSparseMatrix$i[KeepTermLocs]
WordSparseMatrix$j <- KeepTermReMap[WordSparseMatrix$j[KeepTermLocs]]
WordSparseMatrix$v <- WordSparseMatrix$v[KeepTermLocs]
WordSparseMatrix$ncol <- length(KeepTermIDs)
WordSparseMatrix$dimnames$Terms <- WordSparseMatrix$dimnames$Terms[KeepTermIDs]

# Write the filtered term matrix, vote matrix and case details to the working directory.
save(WordSparseMatrix,VoteMatrix,VoteDetails,file="LDA+IRTPaperData.RData")


